/* All routines in this file are code; they are placed in the text section. */
.text

/* Symbols exported to the linker (leading underscore is part of the name). */
.globl	_MMX_BilinearInterpolationRGB555
.globl	_MMX_ScaleLine2x
.globl	_MMX_Scale2x


/*
 * Offsets of the stack-passed arguments relative to %esp at function entry:
 * 0(%esp) holds the return address, so the first 4-byte argument is at
 * 4(%esp), the second at 8(%esp), and so on.
 *
 * These values are only valid before anything is pushed.  A routine that
 * has pushed N bytes must address its arguments as ARGn+N(%esp); the
 * routines in this file use ARGn+8 after two pushes and ARGn+20 after five.
 */
.EQU	ARG1,	4
.EQU	ARG2,	8
.EQU	ARG3,	12
.EQU	ARG4,	16
.EQU	ARG5,	20
.EQU	ARG6,	24
.EQU	ARG7,	28
.EQU	ARG8,	32
.EQU	ARG9,	36
.EQU	ARG10,	40


/*-----------------------------------------------------------------------------
void
MMX_ScaleLine2x(
	Uint16*		pDst,
	Uint16*		pSrc,
	Sint32		srcW);

Doubles one line of 16-bit pixels horizontally: every source pixel is
written twice in a row to the destination.

  pDst  destination line; receives 8 pixels (16 bytes) per group processed
  pSrc  source line; 4 pixels (8 bytes) are read per group
  srcW  source width in pixels.  Only whole groups of 4 pixels are handled
        (srcW / 4 iterations); the remaining srcW % 4 pixels are not
        written.  If srcW is less than 4 (including zero or negative) the
        routine returns without touching memory.

Arguments are read from the stack (see ARGn).  %esi and %edi are preserved;
%eax, the flags and %mm0-%mm2 are clobbered.  emms is executed after the
loop so the caller can use the x87 FPU again.

Note: 27.7% faster than C code.
-----------------------------------------------------------------------------*/
.align 4, 0x90
_MMX_ScaleLine2x:
	pushl		%esi	/* %esp -= 4 */
	pushl		%edi	/* %esp -= 4 */

	movl		ARG1+8(%esp), %edi	/* pDst */
	movl		ARG2+8(%esp), %esi	/* pSrc */
	movl		ARG3+8(%esp), %eax	/* srcW */
	sarl		$2,           %eax	/* number of 4-pixel groups (signed) */

	/*
	 * The loop below is a do/while, so a count of zero would wrap around
	 * and run ~2^32 times.  OF is undefined after a shift by more than one
	 * bit, hence the explicit test before the signed branch.
	 */
	testl		%eax,         %eax
	jle			2f

1:
	movq		(%esi), %mm0		/* mm0 = p3 p2 p1 p0 */
	addl		$8,     %esi

	movq		%mm0,   %mm1
	movq		%mm0,   %mm2
	punpcklwd	%mm0,   %mm1		/* mm1 = p1 p1 p0 p0 */
	punpckhwd	%mm0,   %mm2		/* mm2 = p3 p3 p2 p2 */

	/* Non-temporal store variant, currently disabled:
	movntq		%mm1,   (%edi)
	movntq		%mm2,  8(%edi)
	*/

	movq		%mm1,   (%edi)
	movq		%mm2,  8(%edi)

	addl		$16,    %edi

	decl		%eax
	jnz			1b

	emms						/* leave MMX state; only needed if the loop ran */

2:
	popl	%edi
	popl	%esi
	ret


/*-----------------------------------------------------------------------------
void
MMX_Scale2x(
	Uint16*		pDst,
	Uint16*		pSrc,
	Sint32		srcW,
	Sint32		srcH,
	Sint32		dstPitch,
	Sint32		srcPitch);

Magnifies an image of 16-bit pixels by two in both directions: every source
pixel is written twice horizontally, and every source row is written to two
consecutive destination lines.

  pDst      first destination line
  pSrc      first source line
  srcW      source width in pixels.  Only whole groups of 4 pixels are
            handled (srcW / 4 per row); the remaining srcW % 4 pixels of
            each row are not written.
  srcH      number of source rows; 2 * srcH destination lines are written
  dstPitch  distance between destination lines, in 16-bit units (it is
            doubled below to obtain a byte stride)
  srcPitch  distance between source rows, in 16-bit units (doubled likewise)

If srcH <= 0 or srcW < 4 the routine returns without touching memory.

Arguments are read from the stack (see ARGn).  The stack slots holding
pDst, pSrc and srcW are used as working storage and are overwritten.
%esi, %edi, %ebx, %ecx and %edx are preserved; %eax, the flags and
%mm0-%mm2 are clobbered.  emms is executed after the loops.

Note: slower than C version
-----------------------------------------------------------------------------*/
.align 4, 0x90
_MMX_Scale2x:
	pushl		%esi							/* %esp -= 4 */
	pushl		%edi							/* %esp -= 4 */
	pushl		%ebx							/* %esp -= 4 */
	pushl		%ecx							/* %esp -= 4 */
	pushl		%edx							/* %esp -= 4 */

	/*
	 * All loops below are do/while loops; a zero or negative count would
	 * wrap around and run ~2^32 times, so reject those up front.
	 */
	movl		ARG4+20(%esp), %eax				/* srcH */
	testl		%eax,          %eax
	jle			4f

	movl		ARG3+20(%esp), %ecx				/* srcW */
	sarl		$2,            %ecx				/* srcW / 4 (signed) */
	testl		%ecx,          %ecx				/* OF is undefined after a multi-bit shift */
	jle			4f
	movl		%ecx,          ARG3+20(%esp)	/* keep srcW / 4 in the argument slot */

	movl		ARG5+20(%esp), %edx				/* dstPitch */
	movl		ARG6+20(%esp), %ebx				/* srcPitch */

	shll		$1,            %edx				/* dstPitch in bytes */
	shll		$1,            %ebx				/* srcPitch in bytes */

1:
	/* update dst and counter for the first line */
	movl		ARG1+20(%esp), %edi				/* pDst */
	movl		ARG2+20(%esp), %esi				/* pSrc */
	addl		%edx,          ARG1+20(%esp)	/* pDst += dstPitch */

	movl		ARG3+20(%esp), %ecx				/* srcW / 4 */
2:
		/* first line magnification loop */
		movq		(%esi), %mm0				/* mm0 = p3 p2 p1 p0 */

		movq		%mm0,   %mm1
		movq		%mm0,   %mm2
		punpcklwd	%mm0,   %mm1				/* mm1 = p1 p1 p0 p0 */
		punpckhwd	%mm0,   %mm2				/* mm2 = p3 p3 p2 p2 */
		movq		%mm1,   (%edi)
		movq		%mm2,  8(%edi)

		addl		$16,    %edi
		addl		$8,     %esi

		decl		%ecx
		jnz			2b

		/*
		 * update src, dst, and counter for the second line
		 *
		 * %esi now points past the pixels just read, so it must be reset
		 * to the start of the same source row before that row is emitted
		 * a second time.  Only afterwards is the stored pSrc advanced to
		 * the next row.
		 */
		movl		ARG1+20(%esp), %edi				/* pDst */
		movl		ARG2+20(%esp), %esi				/* pSrc (same row again) */
		addl		%edx,          ARG1+20(%esp)	/* pDst += dstPitch */
		addl		%ebx,          ARG2+20(%esp)	/* pSrc += srcPitch */

		movl		ARG3+20(%esp), %ecx				/* srcW / 4 */
3:
		/* second line magnification loop */
		movq		(%esi), %mm0

		movq		%mm0,   %mm1
		movq		%mm0,   %mm2
		punpcklwd	%mm0,   %mm1
		punpckhwd	%mm0,   %mm2
		movq		%mm1,   (%edi)
		movq		%mm2,  8(%edi)

		addl		$16,    %edi
		addl		$8,     %esi

		decl		%ecx
		jnz			3b

	decl			%eax						/* one source row done */
	jnz				1b

	emms										/* leave MMX state; only needed if the loops ran */

4:
	popl	%edx
	popl	%ecx
	popl	%ebx
	popl	%edi
	popl	%esi
	ret


.align 4, 0x90

/*
_Reg_A:      .LONG 0x00000000



	Rx = ((srcW-1) << 16) / destW;
	Ry = ((srcH-1) << 16) / destH;

	Isx = Isy = 0;
*/

.align 4, 0x90
/*-----------------------------------------------------------------------------
MMX_BilinearInterpolationRGB555

Placeholder.  The routine currently saves %esi and %edi, restores them and
returns; it reads no arguments, touches no memory other than its own stack
and executes no MMX instruction.

Disabled draft of the body, kept for reference:

	movl	ARG1(%esp), %edi
	movl	ARG2(%esp), %esi

	movq	ARG2(%esp), %mm0
	movq	ARG1(%esp), %mm0
	emms

NOTE(review): the draft addresses ARGn(%esp) without adding the 8 bytes
pushed by the prologue, unlike the other routines in this file - confirm
before enabling it.
-----------------------------------------------------------------------------*/
_MMX_BilinearInterpolationRGB555:
	pushl		%esi	/* %esp -= 4 */
	pushl		%edi	/* %esp -= 4 */

	/* body not implemented yet */

	popl		%edi
	popl		%esi
	ret


